import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
df=pd.read_csv('american_bankruptcy.csv',names=['company_name','status','year','Current assets','Cost of goods sold','Depreciation and amortization','EBITDA','Inventory','Net Income','Total Receivables','Market Value','Net Sales','Total Assets','Total Long-term Debt','EBIT','Gross Profit','Total Current Liabilities','Retained Earnings','Total Revenue','Total Liabilities','Total Operating Expenses'],skiprows=[0])
df
| company_name | status | year | Current assets | Cost of goods sold | Depreciation and amortization | EBITDA | Inventory | Net Income | Total Receivables | ... | Net Sales | Total Assets | Total Long-term Debt | EBIT | Gross Profit | Total Current Liabilities | Retained Earnings | Total Revenue | Total Liabilities | Total Operating Expenses | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | C_1 | alive | 1999 | 511.267 | 833.107 | 18.373 | 89.031 | 336.018 | 35.163 | 128.348 | ... | 1024.333 | 740.998 | 180.447 | 70.658 | 191.226 | 163.816 | 201.026 | 1024.333 | 401.483 | 935.302 |
| 1 | C_1 | alive | 2000 | 485.856 | 713.811 | 18.577 | 64.367 | 320.590 | 18.531 | 115.187 | ... | 874.255 | 701.854 | 179.987 | 45.790 | 160.444 | 125.392 | 204.065 | 874.255 | 361.642 | 809.888 |
| 2 | C_1 | alive | 2001 | 436.656 | 526.477 | 22.496 | 27.207 | 286.588 | -58.939 | 77.528 | ... | 638.721 | 710.199 | 217.699 | 4.711 | 112.244 | 150.464 | 139.603 | 638.721 | 399.964 | 611.514 |
| 3 | C_1 | alive | 2002 | 396.412 | 496.747 | 27.172 | 30.745 | 259.954 | -12.410 | 66.322 | ... | 606.337 | 686.621 | 164.658 | 3.573 | 109.590 | 203.575 | 124.106 | 606.337 | 391.633 | 575.592 |
| 4 | C_1 | alive | 2003 | 432.204 | 523.302 | 26.680 | 47.491 | 247.245 | 3.504 | 104.661 | ... | 651.958 | 709.292 | 248.666 | 20.811 | 128.656 | 131.261 | 131.884 | 651.958 | 407.608 | 604.467 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 78677 | C_8971 | alive | 2014 | 233.211 | 43.338 | 14.094 | 45.615 | 3.376 | 25.261 | 22.846 | ... | 104.223 | 1099.101 | 184.666 | 31.521 | 60.885 | 28.197 | 28.095 | 104.223 | 225.887 | 58.608 |
| 78678 | C_8971 | alive | 2015 | 105.559 | 59.184 | 42.592 | 202.133 | 2.288 | 129.688 | 54.611 | ... | 291.153 | 1865.926 | 770.103 | 159.541 | 231.969 | 88.128 | 157.783 | 291.153 | 880.327 | 89.020 |
| 78679 | C_8971 | alive | 2016 | 63.971 | 69.074 | 65.057 | 79.051 | 2.581 | -1.442 | 42.467 | ... | 169.858 | 1746.235 | 683.985 | 13.994 | 100.784 | 85.765 | 156.341 | 169.858 | 770.233 | 90.807 |
| 78680 | C_8971 | alive | 2017 | 135.207 | 66.527 | 65.330 | 69.171 | 2.013 | -20.401 | 27.217 | ... | 161.884 | 1736.110 | 694.035 | 3.841 | 95.357 | 82.010 | 135.941 | 161.884 | 776.697 | 92.713 |
| 78681 | C_8971 | alive | 2018 | 82.589 | 68.817 | 65.201 | 67.262 | 2.112 | -50.946 | 45.839 | ... | 160.513 | 1625.370 | 632.122 | 2.061 | 91.696 | 79.365 | 84.995 | 160.513 | 712.687 | 93.251 |
78682 rows × 21 columns
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 78682 entries, 0 to 78681 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 company_name 78682 non-null object 1 status 78682 non-null object 2 year 78682 non-null int64 3 Current assets 78682 non-null float64 4 Cost of goods sold 78682 non-null float64 5 Depreciation and amortization 78682 non-null float64 6 EBITDA 78682 non-null float64 7 Inventory 78682 non-null float64 8 Net Income 78682 non-null float64 9 Total Receivables 78682 non-null float64 10 Market Value 78682 non-null float64 11 Net Sales 78682 non-null float64 12 Total Assets 78682 non-null float64 13 Total Long-term Debt 78682 non-null float64 14 EBIT 78682 non-null float64 15 Gross Profit 78682 non-null float64 16 Total Current Liabilities 78682 non-null float64 17 Retained Earnings 78682 non-null float64 18 Total Revenue 78682 non-null float64 19 Total Liabilities 78682 non-null float64 20 Total Operating Expenses 78682 non-null float64 dtypes: float64(18), int64(1), object(2) memory usage: 12.6+ MB
df.drop('company_name',inplace=True,axis=1)
df.describe()
| year | Current assets | Cost of goods sold | Depreciation and amortization | EBITDA | Inventory | Net Income | Total Receivables | Market Value | Net Sales | Total Assets | Total Long-term Debt | EBIT | Gross Profit | Total Current Liabilities | Retained Earnings | Total Revenue | Total Liabilities | Total Operating Expenses | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 78682.000000 | 78682.000000 | 78682.000000 | 78682.000000 | 78682.000000 | 78682.000000 | 78682.000000 | 78682.000000 | 7.868200e+04 | 78682.000000 | 78682.000000 | 78682.000000 | 78682.000000 | 78682.000000 | 78682.000000 | 78682.000000 | 78682.000000 | 78682.000000 | 78682.000000 |
| mean | 2007.506317 | 880.362485 | 1594.529029 | 121.234256 | 376.759424 | 201.605717 | 129.382453 | 286.832743 | 3.414355e+03 | 2364.019706 | 2867.110620 | 722.483710 | 255.525035 | 769.490783 | 610.072255 | 532.467069 | 2364.019706 | 1773.563963 | 1987.260307 |
| std | 5.742768 | 3928.564794 | 8930.484664 | 652.376804 | 2012.023142 | 1060.766096 | 1265.532022 | 1335.978571 | 1.841410e+04 | 11950.068842 | 12917.944421 | 3242.170946 | 1494.643534 | 3774.703114 | 2938.387443 | 6369.159440 | 11950.068842 | 8053.684902 | 10419.629038 |
| min | 1999.000000 | -7.760000 | -366.645000 | 0.000000 | -21913.000000 | 0.000000 | -98696.000000 | -0.006000 | 1.000000e-04 | -1964.999000 | 0.001000 | -0.023000 | -25913.000000 | -21536.000000 | 0.001000 | -102362.000000 | -1964.999000 | 0.001000 | -317.197000 |
| 25% | 2002.000000 | 18.924000 | 17.038250 | 1.192000 | -0.811000 | 0.000000 | -7.415750 | 3.281250 | 3.498000e+01 | 27.548500 | 37.363500 | 0.000000 | -2.787000 | 8.521250 | 8.889250 | -68.282750 | 27.548500 | 13.486000 | 32.872500 |
| 50% | 2007.000000 | 100.449500 | 103.661000 | 7.929500 | 15.034500 | 7.023000 | 1.616000 | 22.820000 | 2.275118e+02 | 186.598500 | 213.203500 | 7.593500 | 6.518000 | 63.581500 | 43.333000 | -1.131000 | 186.598500 | 81.988000 | 168.912000 |
| 75% | 2012.000000 | 431.526750 | 634.548000 | 47.971750 | 139.655250 | 74.747250 | 40.144250 | 131.580500 | 1.244890e+03 | 1046.402500 | 1171.364750 | 248.760750 | 87.599000 | 344.074250 | 222.817000 | 146.070000 | 1046.402500 | 629.975000 | 875.522250 |
| max | 2018.000000 | 169662.000000 | 374623.000000 | 28430.000000 | 81730.000000 | 62567.000000 | 104821.000000 | 65812.000000 | 1.073391e+06 | 511729.000000 | 531864.000000 | 166250.000000 | 71230.000000 | 137106.000000 | 116866.000000 | 402089.000000 | 511729.000000 | 337980.000000 | 481580.000000 |
df['status'].unique()
array(['alive', 'failed'], dtype=object)
df.isnull().sum()
status 0 year 0 Current assets 0 Cost of goods sold 0 Depreciation and amortization 0 EBITDA 0 Inventory 0 Net Income 0 Total Receivables 0 Market Value 0 Net Sales 0 Total Assets 0 Total Long-term Debt 0 EBIT 0 Gross Profit 0 Total Current Liabilities 0 Retained Earnings 0 Total Revenue 0 Total Liabilities 0 Total Operating Expenses 0 dtype: int64
plt.figure(figsize=(15,15))
sns.heatmap(df.corr(),annot=True,fmt='.2f')
plt.show()
plt.figure(figsize=(15,10))
sns.pairplot(data=df)
plt.show()
<Figure size 1500x1000 with 0 Axes>
sns.countplot(df['status'])
<AxesSubplot:xlabel='status', ylabel='count'>
# ITS A HIGHLY IMBALANCED DATASET
from sklearn.svm import SVC
svc=SVC()
X=df.drop('status',axis=1)
y=df['status']
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=50)
from sklearn.preprocessing import StandardScaler
mms=StandardScaler()
X_train=mms.fit_transform(X_train)
X_test=mms.transform(X_test)
svc.fit(X_train,y_train)
SVC()
y_pred=svc.predict(X_train)
y_test_pred=svc.predict(X_test)
from sklearn.metrics import classification_report,confusion_matrix
print('TRAIN DATA')
print(classification_report(y_train,y_pred))
print(confusion_matrix(y_train,y_pred))
print('TEST DATA')
print(classification_report(y_test,y_test_pred))
print(confusion_matrix(y_test,y_test_pred))
TRAIN DATA
precision recall f1-score support
alive 0.93 1.00 0.97 55042
failed 0.97 0.01 0.02 3969
accuracy 0.93 59011
macro avg 0.95 0.50 0.49 59011
weighted avg 0.94 0.93 0.90 59011
[[55041 1]
[ 3934 35]]
TEST DATA
precision recall f1-score support
alive 0.94 1.00 0.97 18420
failed 0.86 0.00 0.01 1251
accuracy 0.94 19671
macro avg 0.90 0.50 0.49 19671
weighted avg 0.93 0.94 0.91 19671
[[18419 1]
[ 1245 6]]
from sklearn.linear_model import LogisticRegression
log=LogisticRegression()
log.fit(X_train,y_train)
LogisticRegression()
y_pred=log.predict(X_train)
y_test_pred=log.predict(X_test)
print(classification_report(y_train,y_pred))
print(classification_report(y_test,y_test_pred))
precision recall f1-score support
alive 0.93 1.00 0.97 55042
failed 0.50 0.01 0.02 3969
accuracy 0.93 59011
macro avg 0.72 0.50 0.49 59011
weighted avg 0.90 0.93 0.90 59011
precision recall f1-score support
alive 0.94 1.00 0.97 18420
failed 0.41 0.01 0.01 1251
accuracy 0.94 19671
macro avg 0.67 0.50 0.49 19671
weighted avg 0.90 0.94 0.91 19671
from xgboost import XGBClassifier
xgb=XGBClassifier()
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
df['status']=le.fit_transform(df['status'])
X=df.drop('status',axis=1)
y=df['status']
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=50)
from sklearn.preprocessing import StandardScaler
mms=StandardScaler()
X_train=mms.fit_transform(X_train)
X_test=mms.transform(X_test)
xgb.fit(X_train,y_train)
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=None, ...)
y_pred=xgb.predict(X_train)
y_test_pred=xgb.predict(X_test)
from sklearn.metrics import classification_report,confusion_matrix
print('TRAIN DATA')
print(classification_report(y_train,y_pred))
print(confusion_matrix(y_train,y_pred))
TRAIN DATA
precision recall f1-score support
0 0.96 1.00 0.98 55042
1 0.99 0.37 0.54 3969
accuracy 0.96 59011
macro avg 0.97 0.68 0.76 59011
weighted avg 0.96 0.96 0.95 59011
[[55030 12]
[ 2507 1462]]
print('TEST DATA')
print(classification_report(y_test,y_test_pred))
print(confusion_matrix(y_test,y_test_pred))
TEST DATA
precision recall f1-score support
0 0.94 1.00 0.97 18420
1 0.69 0.12 0.20 1251
accuracy 0.94 19671
macro avg 0.82 0.56 0.59 19671
weighted avg 0.93 0.94 0.92 19671
[[18355 65]
[ 1103 148]]
from imblearn.over_sampling import SMOTE
smote = SMOTE()
X, y= smote.fit_resample(X, y)
print(y.value_counts())
0 73462 1 73462 Name: status, dtype: int64
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=50)
from sklearn.preprocessing import MinMaxScaler
mms=StandardScaler()
X_train=mms.fit_transform(X_train)
X_test=mms.transform(X_test)
xgb.fit(X_train,y_train)
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=None, ...)
y_pred=xgb.predict(X_train)
y_test_pred=xgb.predict(X_test)
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_train,y_pred))
print(confusion_matrix(y_train,y_pred))
precision recall f1-score support
0 0.94 0.87 0.90 55219
1 0.88 0.94 0.91 54974
accuracy 0.90 110193
macro avg 0.91 0.90 0.90 110193
weighted avg 0.91 0.90 0.90 110193
[[47948 7271]
[ 3274 51700]]
print(classification_report(y_test,y_test_pred))
print(confusion_matrix(y_test,y_test_pred))
precision recall f1-score support
0 0.91 0.83 0.87 18243
1 0.85 0.92 0.88 18488
accuracy 0.88 36731
macro avg 0.88 0.88 0.88 36731
weighted avg 0.88 0.88 0.88 36731
[[15176 3067]
[ 1467 17021]]
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
param_grid = {
'learning_rate': [0.1, 0.2, 0.3],
'max_depth': [3, 4, 5,15,20,35,50],
'n_estimators': [100, 200, 300,400,500]
}
random=RandomizedSearchCV(xgb,param_distributions=param_grid,cv=5)
random.fit(X_train,y_train)
RandomizedSearchCV(cv=5,
estimator=XGBClassifier(base_score=None, booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric=None, feature_types=None,
gamma=None, gpu_id=None,
grow_policy=None,
importance_type=None,
interaction_constraints=None,
learning_rate...
max_cat_threshold=None,
max_cat_to_onehot=None,
max_delta_step=None, max_depth=None,
max_leaves=None,
min_child_weight=None, missing=nan,
monotone_constraints=None,
n_estimators=100, n_jobs=None,
num_parallel_tree=None,
predictor=None, random_state=None, ...),
param_distributions={'learning_rate': [0.1, 0.2, 0.3],
'max_depth': [3, 4, 5, 15, 20, 35, 50],
'n_estimators': [100, 200, 300, 400,
500]})
random.best_params_
{'n_estimators': 300, 'max_depth': 15, 'learning_rate': 0.2}
y_pred=random.predict(X_train)
y_test_pred=random.predict(X_test)
print('TRAIN DATA')
print(classification_report(y_train,y_pred))
print(confusion_matrix(y_train,y_pred))
print('TEST DATA')
print(classification_report(y_test,y_test_pred))
print(confusion_matrix(y_test,y_test_pred))
TRAIN DATA
precision recall f1-score support
0 1.00 1.00 1.00 55219
1 1.00 1.00 1.00 54974
accuracy 1.00 110193
macro avg 1.00 1.00 1.00 110193
weighted avg 1.00 1.00 1.00 110193
[[55219 0]
[ 0 54974]]
TEST DATA
precision recall f1-score support
0 0.98 0.95 0.97 18243
1 0.95 0.99 0.97 18488
accuracy 0.97 36731
macro avg 0.97 0.97 0.97 36731
weighted avg 0.97 0.97 0.97 36731
[[17295 948]
[ 277 18211]]
grid=GridSearchCV(xgb,param_grid=param_grid,cv=5)
grid.fit(X_train,y_train)
y_pred=random.predict(X_train)
y_test_pred=random.predict(X_test)
print('TRAIN DATA')
print(classification_report(y_train,y_pred))
print(confusion_matrix(y_train,y_pred))
print('TEST DATA')
print(classification_report(y_test,y_test_pred))
print(confusion_matrix(y_test,y_test_pred))
from sklearn.tree import DecisionTreeClassifier
dtc=DecisionTreeClassifier()
dtc.fit(X_train,y_train)
DecisionTreeClassifier()
y_pred=dtc.predict(X_train)
y_test_pred=dtc.predict(X_test)
print(classification_report(y_train,y_pred))
print(classification_report(y_test,y_test_pred))
precision recall f1-score support
0 1.00 1.00 1.00 55219
1 1.00 1.00 1.00 54974
accuracy 1.00 110193
macro avg 1.00 1.00 1.00 110193
weighted avg 1.00 1.00 1.00 110193
precision recall f1-score support
0 0.90 0.86 0.88 18243
1 0.87 0.91 0.89 18488
accuracy 0.89 36731
macro avg 0.89 0.88 0.89 36731
weighted avg 0.89 0.89 0.89 36731
from sklearn.ensemble import RandomForestClassifier
rtc=RandomForestClassifier()
rtc.fit(X_train,y_train)
RandomForestClassifier()
y_pred=rtc.predict(X_train)
y_test_pred=rtc.predict(X_test)
print('TRAIN DATA')
print(classification_report(y_train,y_pred))
print('TEST DATA')
print(classification_report(y_test,y_test_pred))
TRAIN DATA
precision recall f1-score support
0 1.00 1.00 1.00 55219
1 1.00 1.00 1.00 54974
accuracy 1.00 110193
macro avg 1.00 1.00 1.00 110193
weighted avg 1.00 1.00 1.00 110193
TEST DATA
precision recall f1-score support
0 0.97 0.94 0.96 18243
1 0.95 0.97 0.96 18488
accuracy 0.96 36731
macro avg 0.96 0.96 0.96 36731
weighted avg 0.96 0.96 0.96 36731
from sklearn.ensemble import GradientBoostingClassifier
gbc=GradientBoostingClassifier()
gbc.fit(X_train,y_train)
GradientBoostingClassifier()
y_pred=gbc.predict(X_train)
y_test_pred=gbc.predict(X_test)
print(classification_report(y_train,y_pred))
print(classification_report(y_test,y_test_pred))
precision recall f1-score support
0 0.79 0.70 0.74 55219
1 0.73 0.81 0.77 54974
accuracy 0.76 110193
macro avg 0.76 0.76 0.75 110193
weighted avg 0.76 0.76 0.75 110193
precision recall f1-score support
0 0.78 0.69 0.73 18243
1 0.73 0.81 0.76 18488
accuracy 0.75 36731
macro avg 0.75 0.75 0.75 36731
weighted avg 0.75 0.75 0.75 36731
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
bagging_rf = BaggingClassifier()
bagging_rf.fit(X_train, y_train)
y_train_pred = bagging_rf.predict(X_train)
y_test_pred = bagging_rf.predict(X_test)
print('TRAIN DATA')
print(classification_report(y_train, y_train_pred))
print(confusion_matrix(y_train, y_train_pred))
print('TEST DATA')
print(classification_report(y_test, y_test_pred))
print(confusion_matrix(y_test, y_test_pred))
TRAIN DATA
precision recall f1-score support
0 1.00 1.00 1.00 55219
1 1.00 1.00 1.00 54974
accuracy 1.00 110193
macro avg 1.00 1.00 1.00 110193
weighted avg 1.00 1.00 1.00 110193
[[55132 87]
[ 154 54820]]
TEST DATA
precision recall f1-score support
0 0.94 0.93 0.93 18243
1 0.93 0.94 0.94 18488
accuracy 0.93 36731
macro avg 0.93 0.93 0.93 36731
weighted avg 0.93 0.93 0.93 36731
[[16885 1358]
[ 1048 17440]]
from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier(n_neighbors=9)
knn.fit(X_train,y_train)
y_pred=knn.predict(X_train)
y_test_pred=knn.predict(X_test)
print('TRAIN DATA')
print(classification_report(y_train, y_pred))
print(confusion_matrix(y_train, y_pred))
print('TEST DATA')
print(classification_report(y_test, y_test_pred))
print(confusion_matrix(y_test,y_test_pred))
TRAIN DATA
precision recall f1-score support
0 0.94 1.00 0.97 55042
1 0.84 0.06 0.11 3969
accuracy 0.94 59011
macro avg 0.89 0.53 0.54 59011
weighted avg 0.93 0.94 0.91 59011
[[54997 45]
[ 3727 242]]
TEST DATA
precision recall f1-score support
0 0.94 1.00 0.97 18420
1 0.58 0.04 0.07 1251
accuracy 0.94 19671
macro avg 0.76 0.52 0.52 19671
weighted avg 0.92 0.94 0.91 19671
[[18385 35]
[ 1203 48]]
from sklearn.decomposition import PCA
sc=StandardScaler()
pc=PCA(n_components=5)
X_s=sc.fit_transform(X)
pc.fit(X_s)
X_t=pc.transform(X_s)
pc.components_
array([[ 3.04407239e-02, 2.46351830e-01, 2.33376880e-01,
2.31093400e-01, 2.57998487e-01, 1.92628561e-01,
1.84881156e-01, 2.34769728e-01, 2.29577376e-01,
2.55965015e-01, 2.57974718e-01, 2.00178147e-01,
2.44228457e-01, 2.59758623e-01, 2.59613105e-01,
1.80810650e-01, 2.55965015e-01, 2.46606960e-01,
2.43219319e-01],
[-7.89016614e-02, -3.69441023e-02, 3.88919607e-01,
-2.19587414e-01, -2.04997153e-01, 4.40563284e-01,
-1.43399796e-01, -9.64405842e-02, -1.75708841e-01,
2.82335064e-01, -1.99417682e-01, -3.01603814e-01,
-1.77820372e-01, -2.81449311e-02, 2.24064045e-02,
2.08314199e-02, 2.82335064e-01, -2.04697413e-01,
3.61479110e-01],
[ 2.05593827e-01, -1.19556086e-02, 5.20985735e-02,
1.85292311e-01, -1.24751172e-01, 1.03766579e-01,
-5.03771569e-01, 4.83522138e-02, -2.51522888e-01,
2.31213010e-02, 1.45804532e-01, 4.30694008e-01,
-2.52170173e-01, -5.07578641e-02, 1.19242820e-01,
-4.49459860e-01, 2.31213010e-02, 2.86500604e-01,
5.01561586e-02],
[-9.73653751e-01, -9.32582574e-03, -4.23245051e-03,
8.33557900e-02, 7.33074198e-03, -3.72080398e-02,
-9.90207848e-02, 2.92046207e-02, -4.44900671e-02,
-2.99789559e-03, 5.78288705e-02, 9.90671971e-02,
-2.77224069e-02, 5.44902512e-04, 3.90316858e-02,
-8.93395382e-02, -2.99789559e-03, 8.60010882e-02,
-4.81946065e-03],
[ 2.66300757e-02, -4.58617949e-01, 1.66619838e-01,
3.93734950e-01, 5.33743164e-02, -2.35606253e-01,
-8.11623326e-02, -1.87641645e-01, -2.62680309e-01,
8.87090796e-02, 1.26209026e-01, 8.53193156e-02,
-1.05621285e-01, -1.15140322e-01, -1.93012111e-01,
5.67440125e-01, 8.87090796e-02, 7.81254471e-02,
9.11528551e-02]])
pc.explained_variance_ratio_
array([0.68748537, 0.08125736, 0.06409237, 0.05140839, 0.0298216 ])
plt.scatter(X_t[:,0],X_t[:,1],c=y)
plt.show()
plt.bar(np.arange(1,6),pc.explained_variance_ratio_)
plt.plot(np.arange(1,6),np.cumsum(pc.explained_variance_ratio_))
[<matplotlib.lines.Line2D at 0x22026410100>]
models = {
'SVM': SVC(),
'Logistic Regression': LogisticRegression(),
'XGBoost': XGBClassifier(),
'Decision Tree': DecisionTreeClassifier(),
'Random Forest': RandomForestClassifier(),
'Gradient Boosting': GradientBoostingClassifier(),
'Bagging': BaggingClassifier(),
'KNN': KNeighborsClassifier(n_neighbors=9)
}
for i,j in models.items():
print(f'Model: {i}')
if i == 'XGBoost':
random = RandomizedSearchCV(j, param_distributions=param_grid, cv=5)
random.fit(X_train, y_train)
j = random.best_estimator_
j.fit(X_train,y_train)
y_train_pred = j.predict(X_train)
y_test_pred = j.predict(X_test)
print("TRAIN DATA")
print(classification_report(y_train, y_train_pred))
print(confusion_matrix(y_train, y_train_pred))
print("TEST DATA")
print(classification_report(y_test, y_test_pred))
print(confusion_matrix(y_test, y_test_pred))
Model: SVM
TRAIN DATA
precision recall f1-score support
0 0.77 0.48 0.60 55219
1 0.62 0.86 0.72 54974
accuracy 0.67 110193
macro avg 0.70 0.67 0.66 110193
weighted avg 0.70 0.67 0.66 110193
[[26756 28463]
[ 7773 47201]]
TEST DATA
precision recall f1-score support
0 0.77 0.48 0.59 18243
1 0.63 0.86 0.72 18488
accuracy 0.67 36731
macro avg 0.70 0.67 0.66 36731
weighted avg 0.70 0.67 0.66 36731
[[ 8823 9420]
[ 2634 15854]]
Model: Logistic Regression
TRAIN DATA
precision recall f1-score support
0 0.66 0.57 0.61 55219
1 0.62 0.71 0.66 54974
accuracy 0.64 110193
macro avg 0.64 0.64 0.64 110193
weighted avg 0.64 0.64 0.64 110193
[[31368 23851]
[16037 38937]]
TEST DATA
precision recall f1-score support
0 0.66 0.57 0.61 18243
1 0.62 0.71 0.66 18488
accuracy 0.64 36731
macro avg 0.64 0.64 0.63 36731
weighted avg 0.64 0.64 0.63 36731
[[10322 7921]
[ 5429 13059]]
Model: XGBoost